## Error: package 'xts' required by 'PerformanceAnalytics' could not be found
Back in 2014, fivethirtyeight.com published an article on alcohol consumption in different countries. The data set `drinks` is available as part of the fivethirtyeight package. Make sure you have installed the fivethirtyeight package before proceeding.
# Load the drinks data set from the fivethirtyeight package
library(fivethirtyeight)
data(drinks)
# Or download the same data directly from the fivethirtyeight GitHub repository
alcohol_direct <- read_csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/alcohol-consumption/drinks.csv")
The skim function was useful for understanding the variable types and checking for missing values.
| Name | alcohol_direct |
| Number of rows | 193 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| country | 0 | 1 | 3 | 28 | 0 | 193 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| beer_servings | 0 | 1 | 106.16 | 101.14 | 0 | 20.0 | 76.0 | 188.0 | 376.0 | ▇▃▂▂▁ |
| spirit_servings | 0 | 1 | 80.99 | 88.28 | 0 | 4.0 | 56.0 | 128.0 | 438.0 | ▇▃▂▁▁ |
| wine_servings | 0 | 1 | 49.45 | 79.70 | 0 | 1.0 | 8.0 | 59.0 | 370.0 | ▇▁▁▁▁ |
| total_litres_of_pure_alcohol | 0 | 1 | 4.72 | 3.77 | 0 | 1.3 | 4.2 | 7.2 | 14.4 | ▇▃▅▃▁ |
# There are two variable types, numeric and character. The country column is a character type and the other four columns (beer, wine, spirit, and total) are numeric.
# No missing values to worry about. Plot that shows the top 25 beer consuming countries
## Rows: 193
## Columns: 5
## $ country <chr> "Afghanistan", "Albania", "Algeria", "An…
## $ beer_servings <dbl> 0, 89, 25, 245, 217, 102, 193, 21, 261, …
## $ spirit_servings <dbl> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75…
## $ wine_servings <dbl> 0, 54, 14, 312, 45, 45, 221, 11, 212, 19…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8,…
# Bar chart of the top 25 beer-consuming countries, ordered by servings.
# geom_smooth() was removed: a smoother is not meaningful over a discrete
# (country) axis on a bar chart and only triggers a ggplot2 warning.
ggplot(top_25_beer, aes(x = fct_reorder(country, beer_servings), y = beer_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Beer Consuming Countries", x = "Countries", y = "Beer Servings") +
  NULL
Plot that shows the top 25 wine consuming countries
## Rows: 193
## Columns: 5
## $ country <chr> "Afghanistan", "Albania", "Algeria", "An…
## $ beer_servings <dbl> 0, 89, 25, 245, 217, 102, 193, 21, 261, …
## $ spirit_servings <dbl> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75…
## $ wine_servings <dbl> 0, 54, 14, 312, 45, 45, 221, 11, 212, 19…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8,…
# Bar chart of the top 25 wine-consuming countries, ordered by servings.
# geom_smooth() was removed: a smoother is not meaningful over a discrete
# (country) axis on a bar chart and only triggers a ggplot2 warning.
ggplot(top_25_wine, aes(x = fct_reorder(country, wine_servings), y = wine_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Wine Consuming Countries", x = "Countries", y = "Wine Servings") +
  NULL
Plot that shows the top 25 spirit consuming countries
## Rows: 193
## Columns: 5
## $ country <chr> "Afghanistan", "Albania", "Algeria", "An…
## $ beer_servings <dbl> 0, 89, 25, 245, 217, 102, 193, 21, 261, …
## $ spirit_servings <dbl> 0, 132, 0, 138, 57, 128, 25, 179, 72, 75…
## $ wine_servings <dbl> 0, 54, 14, 312, 45, 45, 221, 11, 212, 19…
## $ total_litres_of_pure_alcohol <dbl> 0.0, 4.9, 0.7, 12.4, 5.9, 4.9, 8.3, 3.8,…
# Bar chart of the top 25 spirit-consuming countries, ordered by servings.
# geom_smooth() was removed: a smoother is not meaningful over a discrete
# (country) axis on a bar chart and only triggers a ggplot2 warning.
ggplot(top_25_spirit, aes(x = fct_reorder(country, spirit_servings), y = spirit_servings)) +
  geom_col() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Top 25 Spirit Consuming Countries", x = "Countries", y = "Spirit Servings") +
  NULL
The graphs depict that, on average, Namibians consume the most beer, the French consume the most wine, and Grenadians consume the most spirits. These results may be a consequence of availability of particular ingredients (sugarcane, hops, barley, grapes), implications of specific climates, or cultural influences. Other countries have increasingly stringent or relaxed regulations on the different types of alcohol which alter the consumption habits of consumers.
We will look at a subset sample of movies, taken from the Kaggle IMDB 5000 movie dataset
This loads the dataset movies.
Skim was used to check for missing values; there were none. However, there are duplicate values for the three character type columns.
## Error in skim(movies): object 'movies' not found
This is used for displaying a series of tables concerning movies.
# Table with the count of movies by genre, ranked in descending order
movies %>%
  group_by(genre) %>%
  count(sort = TRUE)
## Error in eval(lhs, parent, parent): object 'movies' not found
# Table with the average gross earning and budget (`gross` and `budget`) by genre,
# plus the return on budget (mean gross over mean budget), in descending order
movies %>%
  group_by(genre) %>%
  summarize(mean_budget = mean(budget),
            mean_gross_earning = mean(gross),
            return_on_budget = mean_gross_earning / mean_budget) %>%
  arrange(desc(return_on_budget))
## Error in eval(lhs, parent, parent): object 'movies' not found
# Table that shows the top 15 directors who have created the highest gross revenue
# in the box office (mean, sd, median, and total gross per director)
top_15_directors <- movies %>%
  group_by(director) %>%
  summarize(mean_director = mean(gross),
            standard_dev = sd(gross),
            median = median(gross),
            total = sum(gross)) %>%
  arrange(desc(total)) %>%
  slice(1:15)
## Error in eval(lhs, parent, parent): object 'movies' not found
## Error in eval(expr, envir, enclos): object 'top_15_directors' not found
# Table that describes how ratings are distributed by genre
rating <- movies %>%
  group_by(genre) %>%
  summarize(min = min(rating),
            max = max(rating),
            median = median(rating),
            standard_dev = sd(rating))
## Error in eval(lhs, parent, parent): object 'movies' not found
## Error in ggplot(movies, aes(x = rating)): object 'movies' not found
We have placed “gross” on the y-axis and “cast_facebook_likes” on the x-axis. The amount of Facebook likes the cast has received is likely not the strongest indicator of high gross revenues.
# Scatter plot of gross revenue against cast Facebook likes (both on log10 scales)
ggplot(movies, aes(x = cast_facebook_likes, y = gross)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
## Error in ggplot(movies, aes(x = cast_facebook_likes, y = gross)): object 'movies' not found
The correlation between gross revenues and budget seems to be stronger than the previous correlation with Facebook likes.
## Error in ggplot(movies, aes(x = gross, y = budget)): object 'movies' not found
There seems to be a small relationship between gross revenue and rating when faceted by genre. Depending on the genre the correlation slightly changes, or there isn’t enough data in particular genres to make a determination.
# Scatter plot of rating against gross revenue, faceted by genre (log10 scales)
ggplot(movies, aes(x = gross, y = rating)) +
  geom_point() +
  facet_wrap(vars(genre)) +
  scale_x_log10() +
  scale_y_log10()
## Error in ggplot(movies, aes(x = gross, y = rating)): object 'movies' not found
Returns of financial stocks
Table and a bar plot that shows the number of companies per sector, in descending order
# Count the NYSE companies in each sector, in descending order
chart_sector <- nyse %>%
  group_by(sector) %>%
  summarise(number_of_companies = n()) %>%
  arrange(desc(number_of_companies))
# Bar plot of the number of companies per sector
ggplot(chart_sector, aes(x = fct_reorder(sector, -number_of_companies), y = number_of_companies)) +
  geom_col() +
  labs(title = "Companies Per Sector", x = "Sector", y = "Number Per Sector") +
  theme(axis.text.x = element_text(angle = 90))
Gathers the tickers from Wikipedia
djia_url <- "https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average"
# Get the tables that exist on the URL
tables <- djia_url %>%
  read_html() %>%
  html_nodes(css = "table")
# Parse HTML tables into a dataframe called djia.
# Use purrr::map() to create a list of all tables in the URL
djia <- map(tables, . %>%
              html_table(fill = TRUE) %>%
              clean_names())
# constituents
table1 <- djia[[2]] %>% # the second table on the page contains the ticker symbols
  mutate(date_added = ymd(date_added),
         # If a stock is listed on NYSE, its symbol is, e.g., "NYSE: MMM".
         # We will get prices from Yahoo Finance, which requires just the ticker.
         # NOTE: the original pattern "NYSE*" was a regex bug (it means "NYS"
         # followed by zero or more "E"); match the literal "NYSE:" prefix
         # instead, then drop the first 6 characters ("NYSE: ") of the string.
         ticker = ifelse(str_detect(symbol, "NYSE:"),
                         str_sub(symbol, 7, 11),
                         symbol)
  )
# We need a vector of strings with just the 30 tickers + SPY
tickers <- table1 %>%
  select(ticker) %>%
  pull() %>% # pull() gets them as a string of characters
  c("SPY")   # and lets us add SPY, the SP500 ETF
Price data on stocks
# Notice the cache=TRUE argument in the chunk options. Because getting data is
# time consuming, cache=TRUE means that once it downloads data, the chunk will
# not run again next time you knit your Rmd.
myStocks <- tickers %>%
  tq_get(get = "stock.prices",
         from = "2000-01-01",
         to = "2020-08-31") %>%
  group_by(symbol)
## Error in tq_get(., get = "stock.prices", from = "2000-01-01", to = "2020-08-31"): could not find function "tq_get"
## Error in glimpse(myStocks): object 'myStocks' not found
Calculation of different return periods
# Calculate daily log returns from the adjusted close.
# NOTE(review): the original passed `cols = c(nested.col)`; `nested.col` is
# never defined and `cols` is a deprecated tq_transmute argument, so it was removed.
myStocks_returns_daily <- myStocks %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "daily",
               type = "log",
               col_rename = "daily_returns")
## Error in eval(lhs, parent, parent): object 'myStocks' not found
# Calculate monthly arithmetic returns from the adjusted close.
# NOTE(review): the original passed `cols = c(nested.col)`; `nested.col` is
# never defined and `cols` is a deprecated tq_transmute argument, so it was removed.
myStocks_returns_monthly <- myStocks %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "monthly",
               type = "arithmetic",
               col_rename = "monthly_returns")
## Error in eval(lhs, parent, parent): object 'myStocks' not found
# Calculate yearly arithmetic returns from the adjusted close.
# NOTE(review): the original passed `cols = c(nested.col)`; `nested.col` is
# never defined and `cols` is a deprecated tq_transmute argument, so it was removed.
myStocks_returns_annual <- myStocks %>%
  group_by(symbol) %>%
  tq_transmute(select = adjusted,
               mutate_fun = periodReturn,
               period = "yearly",
               type = "arithmetic",
               col_rename = "yearly_returns")
## Error in eval(lhs, parent, parent): object 'myStocks' not found
Summarized monthly returns since 2017-01-01 for each of the stocks and SPY; min, max, median, mean, SD.
# Summary statistics of monthly returns per symbol since 2017-01-01
summarise_monthly_returns <- myStocks_returns_monthly %>%
  dplyr::group_by(symbol) %>%
  dplyr::filter(date >= as.Date("2017-01-01")) %>%
  summarise(min_return = min(monthly_returns),
            max_return = max(monthly_returns),
            median_return = median(monthly_returns),
            mean_return = mean(monthly_returns),
            std_return = sd(monthly_returns))
## Error in eval(lhs, parent, parent): object 'myStocks_returns_monthly' not found
## Error in eval(expr, envir, enclos): object 'summarise_monthly_returns' not found
Density plot, using geom_density(), for each of the stocks
# Density plot of monthly returns, one panel per symbol
ggplot(myStocks_returns_monthly, aes(x = monthly_returns)) +
  geom_density() +
  facet_wrap(~symbol) +
  labs(title = "Density plot of the distribution of monthly returns",
       y = "Density", x = "Monthly Returns") +
  theme(plot.title = element_text(hjust = 0.5, size = 12, face = "bold"))
## Error in ggplot(myStocks_returns_monthly, aes(x = monthly_returns)): object 'myStocks_returns_monthly' not found
We can infer from the plot that "SPY" is the least risky, since its monthly return distribution is the narrowest, whereas "DOW" has the widest distribution, making it the most risky.
Plot that shows the expected monthly return (mean) of a stock on the Y axis and the risk (standard deviation) in the X-axis.
# Expected monthly return (mean) vs risk (standard deviation), one point per symbol
myStocks_returns_monthly %>%
  group_by(symbol) %>%
  summarise(mean = mean(monthly_returns), sd = sd(monthly_returns)) %>%
  ggplot(aes(x = sd, y = mean, label = symbol)) +
  geom_point() +
  geom_smooth() +
  ggrepel::geom_text_repel(show.legend = FALSE, size = 5) +
  labs(title = "Expected Monthly Returns vs Risk ",
       y = "Expected monthly returns",
       x = "Risk") +
  theme(plot.title = element_text(hjust = 0.5, size = 10, face = "bold"))
## Error in eval(lhs, parent, parent): object 'myStocks_returns_monthly' not found
From this plot we can say that usually when the risk is low the expected monthly return is also low (e.g. SPY, JNJ) and vice versa (e.g. AAPL, CRM). However, DOW is risky but the expected monthly return is not very high. On the other hand, V, UNH and NKE have high expected return compared to their risk profile.
For this task, you will analyse a data set on Human Resource Analytics. The IBM HR Analytics Employee Attrition & Performance data set is a fictional data set created by IBM data scientists. Among other things, the data set includes employees’ income, their distance from work, their position in the company, their level of education, etc. A full description can be found on the website.
# Load the IBM HR Analytics Employee Attrition & Performance data set
hr_dataset <- read_csv("datasets_1067_1925_WA_Fn-UseC_-HR-Employee-Attrition.csv")
glimpse(hr_dataset)
## Rows: 1,470
## Columns: 35
## $ Age <dbl> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, …
## $ Attrition <chr> "Yes", "No", "Yes", "No", "No", "No", "No", …
## $ BusinessTravel <chr> "Travel_Rarely", "Travel_Frequently", "Trave…
## $ DailyRate <dbl> 1102, 279, 1373, 1392, 591, 1005, 1324, 1358…
## $ Department <chr> "Sales", "Research & Development", "Research…
## $ DistanceFromHome <dbl> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26,…
## $ Education <dbl> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3,…
## $ EducationField <chr> "Life Sciences", "Life Sciences", "Other", "…
## $ EmployeeCount <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ EmployeeNumber <dbl> 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16…
## $ EnvironmentSatisfaction <dbl> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3,…
## $ Gender <chr> "Female", "Male", "Male", "Female", "Male", …
## $ HourlyRate <dbl> 94, 61, 92, 56, 40, 79, 81, 67, 44, 94, 84, …
## $ JobInvolvement <dbl> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2,…
## $ JobLevel <dbl> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1,…
## $ JobRole <chr> "Sales Executive", "Research Scientist", "La…
## $ JobSatisfaction <dbl> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3,…
## $ MaritalStatus <chr> "Single", "Married", "Single", "Married", "M…
## $ MonthlyIncome <dbl> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 26…
## $ MonthlyRate <dbl> 19479, 24907, 2396, 23159, 16632, 11864, 996…
## $ NumCompaniesWorked <dbl> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5,…
## $ Over18 <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y",…
## $ OverTime <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes"…
## $ PercentSalaryHike <dbl> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, …
## $ PerformanceRating <dbl> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3,…
## $ RelationshipSatisfaction <dbl> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2,…
## $ StandardHours <dbl> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, …
## $ StockOptionLevel <dbl> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0,…
## $ TotalWorkingYears <dbl> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, …
## $ TrainingTimesLastYear <dbl> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4,…
## $ WorkLifeBalance <dbl> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3,…
## $ YearsAtCompany <dbl> 6, 10, 0, 8, 2, 7, 1, 1, 9, 7, 5, 9, 5, 2, 4…
## $ YearsInCurrentRole <dbl> 4, 7, 0, 7, 2, 7, 0, 0, 7, 7, 4, 5, 2, 2, 2,…
## $ YearsSinceLastPromotion <dbl> 0, 1, 0, 3, 2, 3, 0, 0, 1, 7, 0, 0, 4, 1, 0,…
## $ YearsWithCurrManager <dbl> 5, 7, 0, 0, 2, 6, 0, 0, 8, 7, 3, 8, 3, 2, 3,…
Clean the data set
# Clean the HR data set: snake_case the column names and recode the integer
# survey/level columns into their human-readable labels, then keep only the
# columns used in the analysis below.
hr_cleaned <- hr_dataset %>%
  clean_names() %>%
  mutate(
    education = case_when(
      education == 1 ~ "Below College",
      education == 2 ~ "College",
      education == 3 ~ "Bachelor",
      education == 4 ~ "Master",
      education == 5 ~ "Doctor"
    ),
    environment_satisfaction = case_when(
      environment_satisfaction == 1 ~ "Low",
      environment_satisfaction == 2 ~ "Medium",
      environment_satisfaction == 3 ~ "High",
      environment_satisfaction == 4 ~ "Very High"
    ),
    job_satisfaction = case_when(
      job_satisfaction == 1 ~ "Low",
      job_satisfaction == 2 ~ "Medium",
      job_satisfaction == 3 ~ "High",
      job_satisfaction == 4 ~ "Very High"
    ),
    performance_rating = case_when(
      performance_rating == 1 ~ "Low",
      performance_rating == 2 ~ "Good",
      performance_rating == 3 ~ "Excellent",
      performance_rating == 4 ~ "Outstanding"
    ),
    work_life_balance = case_when(
      work_life_balance == 1 ~ "Bad",
      work_life_balance == 2 ~ "Good",
      work_life_balance == 3 ~ "Better",
      work_life_balance == 4 ~ "Best"
    )
  ) %>%
  select(age, attrition, daily_rate, department,
         distance_from_home, education,
         gender, job_role, environment_satisfaction,
         job_satisfaction, marital_status,
         monthly_income, num_companies_worked, percent_salary_hike,
         performance_rating, total_working_years,
         work_life_balance, years_at_company,
         years_since_last_promotion)
Summary describing this dataset.
#1. Shows how often people leave the company (`attrition`).
# BUG FIX: the original divided by count(hr_cleaned), which is a 1x1 tibble
# (not a number) and yields a data-frame-valued column; use n() instead.
attrition_rate <- hr_cleaned %>%
  mutate(people_leave = ifelse(attrition == "Yes", 1, 0)) %>%
  summarize(attrition_rate = sum(people_leave) / n())
#2. Shows how `age`, `years_at_company`, `monthly_income` and
# `years_since_last_promotion` are distributed. Looking at the summary
# statistics, "age" is the variable closest to a normal distribution since
# its mean and median are closest together.
summary(hr_cleaned %>%
          select(age, years_at_company, monthly_income, years_since_last_promotion))
##       age       years_at_company monthly_income  years_since_last_promotion
## Min. :18.0 Min. : 0 Min. : 1009 Min. : 0.00
## 1st Qu.:30.0 1st Qu.: 3 1st Qu.: 2911 1st Qu.: 0.00
## Median :36.0 Median : 5 Median : 4919 Median : 1.00
## Mean :36.9 Mean : 7 Mean : 6503 Mean : 2.19
## 3rd Qu.:43.0 3rd Qu.: 9 3rd Qu.: 8379 3rd Qu.: 3.00
## Max. :60.0 Max. :40 Max. :19999 Max. :15.00
#3. Shows how `job_satisfaction` and `work_life_balance` are distributed.
hr_cleaned %>%
  group_by(job_satisfaction) %>%
  summarise(total = n()) %>%
  mutate(percentage_of_total = total / sum(total))
## # A tibble: 4 x 3
## job_satisfaction total percentage_of_total
## <chr> <int> <dbl>
## 1 High 442 0.301
## 2 Low 289 0.197
## 3 Medium 280 0.190
## 4 Very High 459 0.312
# Distribution of `work_life_balance` as counts and shares of the total
hr_cleaned %>%
  group_by(work_life_balance) %>%
  summarise(total = n()) %>%
  mutate(percentage_of_total = total / sum(total))
## # A tibble: 4 x 3
## work_life_balance total percentage_of_total
## <chr> <int> <dbl>
## 1 Bad 80 0.0544
## 2 Best 153 0.104
## 3 Better 893 0.607
## 4 Good 344 0.234
#4. Relationship between monthly income and education, as well as monthly income
# and gender. Individuals with a masters or doctorate have higher incomes than
# those with other degrees. Females tend to receive slightly higher monthly income.
hr_cleaned %>%
  ggplot(aes(x = education, y = monthly_income)) +
  geom_boxplot() +
  labs(title = "Relationship between monthly income and education",
       subtitle = "People with higher education tend to receive higher monthly income",
       x = "Education", y = "Monthly Income")
# BUG FIX: the x-axis label below said "Education" although the axis shows gender.
hr_cleaned %>%
  ggplot(aes(x = gender, y = monthly_income)) +
  geom_boxplot() +
  labs(title = "Relationship between monthly income and gender",
       subtitle = "Females tend to receive slightly higher monthly income",
       x = "Gender",
       y = "Monthly Income")
#5. Boxplot of income vs job role, in descending order.
# Boxplot of monthly income by job role, ordered by descending income
hr_cleaned %>%
  ggplot(aes(x = reorder(job_role, -monthly_income), y = monthly_income)) +
  geom_boxplot() +
  theme(plot.title = element_text(vjust = 1, size = 5),
        axis.text.x = element_text(angle = 90)) +
  labs(title = "Relationship between monthly income and job role",
       y = "Monthly Income",
       x = "Job role")
#6. Bar chart of the mean (or median?) income by education level.
# Mean monthly income by education level, plotted in descending order
mean_income <- hr_cleaned %>%
  group_by(education) %>%
  summarise(mean_income = mean(monthly_income))
ggplot(mean_income, aes(x = reorder(education, -mean_income), y = mean_income)) +
  geom_col() +
  labs(title = "Relationship between mean monthly income and education level",
       y = "Mean Monthly Income",
       x = "Education")
# Median monthly income by education level, plotted in descending order
median_income <- hr_cleaned %>%
  group_by(education) %>%
  summarise(median_income = median(monthly_income))
ggplot(median_income, aes(x = reorder(education, -median_income), y = median_income)) +
  geom_col() +
  labs(title = "Relationship between median monthly income and education level",
       y = "Median Monthly Income",
       x = "Education")
#7. The distribution of income by education level.
# Density of monthly income, one panel per education level
ggplot(hr_cleaned, aes(x = monthly_income)) +
  geom_density() +
  facet_wrap(vars(education)) +
  labs(title = "Distribution of monthly income by education level ",
       x = "Monthly Income",
       y = "Density")
#8. Plot of income vs age, faceted by `job_role`
# Scatter plot of monthly income against age (log10 scales), faceted by job role
ggplot(hr_cleaned, aes(x = age, y = monthly_income)) +
  geom_point() +
  facet_wrap(vars(job_role)) +
  scale_x_log10() +
  scale_y_log10() +
  labs(title = "Relationship between monthly income and age by job role ",
       x = "Age",
       y = "Monthly Income")
Figure 3 shows the homicide and suicide rate among white men.
Creating a replication of figure 3 (homicide and suicide rate among white men).
## ST State Population.Black Population.White Deaths.homicide.Black
## 1 AK Alaska 153664 2274605 23
## 2 AK Alaska 153664 2274605 NA
## 3 AL Alabama 5376635 14267619 183
## 4 AL Alabama 5376635 14267619 1812
## 5 AR Arkansas 1991165 9747025 667
## 6 AR Arkansas 1991165 9747025 90
## Deaths.homicide.White crude.homicide.Black crude.homicide.White
## 1 78 14.97 3.43
## 2 10 NA NA
## 3 124 3.40 0.87
## 4 620 33.70 4.35
## 5 416 33.50 4.27
## 6 110 4.52 1.13
## adjusted.homicide.Black adjusted.homicide.White Deaths.suicide.Black
## 1 12.30 3.24 23
## 2 NA NA 12
## 3 3.51 0.85 148
## 4 33.00 4.47 370
## 5 33.39 4.39 118
## 6 4.81 1.13 75
## Deaths.suicide.White crude.suicide.Black crude.suicide.White
## 1 535 14.97 23.52
## 2 196 NA 8.62
## 3 1222 2.75 8.56
## 4 3195 6.88 22.39
## 5 2154 5.93 22.10
## 6 1027 3.77 10.54
## adjusted.suicide.Black adjusted.suicide.White type crude.RD.suicide
## 1 12.77 23.47 Firearm -8.55
## 2 NA 8.21 Non-Firearm NA
## 3 2.84 8.64 Non-Firearm -5.81
## 4 7.20 20.97 Firearm -15.51
## 5 6.25 20.98 Firearm -16.17
## 6 3.93 10.89 Non-Firearm -6.77
## adj.RD.suicide crude.RD.homicide adj.RD.homicide ST.order.RD.homicide
## 1 -10.70 11.54 9.06 AK
## 2 NA NA NA AK
## 3 -5.80 2.53 2.66 AL
## 4 -13.77 29.35 28.53 AL
## 5 -14.73 29.23 29.00 AR
## 6 -6.96 3.39 3.68 AR
## ST.order.RD.suicide gun.house.prev gun.house.prev.category average.pop.white
## 1 AK 59.8 45.0%-65.5% 252734
## 2 AK 59.8 45.0%-65.5% 252734
## 3 AL 52.2 45.0%-65.5% 1585291
## 4 AL 52.2 45.0%-65.5% 1585291
## 5 AR 58.8 45.0%-65.5% 1083003
## 6 AR 58.8 45.0%-65.5% 1083003
## average.pop.black type.fac
## 1 17074 Firearm-related
## 2 17074 Firearm-unrelated
## 3 597404 Firearm-unrelated
## 4 597404 Firearm-related
## 5 221241 Firearm-related
## 6 221241 Firearm-unrelated
## Rows: 104
## Columns: 28
## $ ST <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ State <chr> "Alaska", "Alaska", "Alabama", "Alabama", "Ar…
## $ Population.Black <int> 153664, 153664, 5376635, 5376635, 1991165, 19…
## $ Population.White <int> 2274605, 2274605, 14267619, 14267619, 9747025…
## $ Deaths.homicide.Black <int> 23, NA, 183, 1812, 667, 90, 300, 40, 3712, 43…
## $ Deaths.homicide.White <int> 78, 10, 124, 620, 416, 110, 586, 145, 1399, 6…
## $ crude.homicide.Black <dbl> 14.97, NA, 3.40, 33.70, 33.50, 4.52, 22.06, 2…
## $ crude.homicide.White <dbl> 3.43, NA, 0.87, 4.35, 4.27, 1.13, 3.46, 0.86,…
## $ adjusted.homicide.Black <dbl> 12.30, NA, 3.51, 33.00, 33.39, 4.81, 20.40, 2…
## $ adjusted.homicide.White <dbl> 3.24, NA, 0.85, 4.47, 4.39, 1.13, 3.63, 0.91,…
## $ Deaths.suicide.Black <int> 23, 12, 148, 370, 118, 75, 101, 80, 492, 639,…
## $ Deaths.suicide.White <int> 535, 196, 1222, 3195, 2154, 1027, 4146, 1972,…
## $ crude.suicide.Black <dbl> 14.97, NA, 2.75, 6.88, 5.93, 3.77, 7.43, 5.88…
## $ crude.suicide.White <dbl> 23.52, 8.62, 8.56, 22.39, 22.10, 10.54, 24.47…
## $ adjusted.suicide.Black <dbl> 12.77, NA, 2.84, 7.20, 6.25, 3.93, 7.51, 5.58…
## $ adjusted.suicide.White <dbl> 23.47, 8.21, 8.64, 20.97, 20.98, 10.89, 21.42…
## $ type <chr> "Firearm", "Non-Firearm", "Non-Firearm", "Fir…
## $ crude.RD.suicide <dbl> -8.55, NA, -5.81, -15.51, -16.17, -6.77, -17.…
## $ adj.RD.suicide <dbl> -10.70, NA, -5.80, -13.77, -14.73, -6.96, -13…
## $ crude.RD.homicide <dbl> 11.54, NA, 2.53, 29.35, 29.23, 3.39, 18.60, 2…
## $ adj.RD.homicide <dbl> 9.06, NA, 2.66, 28.53, 29.00, 3.68, 16.77, 1.…
## $ ST.order.RD.homicide <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ ST.order.RD.suicide <chr> "AK", "AK", "AL", "AL", "AR", "AR", "AZ", "AZ…
## $ gun.house.prev <dbl> 59.8, 59.8, 52.2, 52.2, 58.8, 58.8, 32.3, 32.…
## $ gun.house.prev.category <chr> "45.0%-65.5%", "45.0%-65.5%", "45.0%-65.5%", …
## $ average.pop.white <dbl> 252734, 252734, 1585291, 1585291, 1083003, 10…
## $ average.pop.black <dbl> 17074, 17074, 597404, 597404, 221241, 221241,…
## $ type.fac <chr> "Firearm-related", "Firearm-unrelated", "Fire…
## Rows: 49
## Columns: 5
## $ ST <chr> "AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DE…
## $ adjusted.suicide.White <dbl> 23.47, 20.97, 20.98, 21.42, 11.63, 19.48, 6.4…
## $ adjusted.homicide.White <dbl> 3.24, 4.47, 4.39, 3.63, 2.05, 1.75, 0.86, 1.7…
## $ gun.house.prev.category <chr> "45.0%-65.5%", "45.0%-65.5%", "45.0%-65.5%", …
## $ average.pop.white <dbl> 252734, 1585291, 1083003, 1882345, 7703022, 1…
Displays a figure we need to reproduce that shows the top ten cities in highest amounts raised in political contributions in California during the 2016 US Presidential election.
To get this plot, you must join two dataframes; the one you have with all contributions, and data that can translate zipcodes to cities. You can find a file with all US zipcodes, e.g., here http://www.uszipcodelist.com/download.html.
The easiest way would be to create two plots and then place one next to each other. For this, you will need the patchwork package. https://cran.r-project.org/web/packages/patchwork/index.html
While this is ok, what if one asked you to create the same plot for the top 10 candidates and not just the top two? The most challenging part is how to reorder within categories, and for this you will find Julia Silge’s post on REORDERING AND FACETTING FOR GGPLOT2 useful.
# Make sure you use vroom() as it is significantly faster than read.csv()
CA_contributors_2016 <- vroom::vroom(here::here("data","CA_contributors_2016.csv"))
zipcodes_2016 <- vroom::vroom(here::here("data","zip_code_database.csv"))
# Mutating the zipcodes and merging with CA_contributors_2016 dataset.
# The zip column is converted to double so the join keys have matching types
# (presumably the zip database stores zip as character — TODO confirm).
zipcodes_2016 <- zipcodes_2016 %>%
mutate(zip=as.double(zip))
# Left join keeps every contribution row, adding city info where the zip matches
merged_dataframe <- left_join( CA_contributors_2016, zipcodes_2016, by="zip")
# Table showing the total amount of contributions received by city, in
# descending order (top ten cities)
merged_dataframe %>%
  group_by(primary_city) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  slice(1:10)
## # A tibble: 10 x 2
## primary_city total_amount
## <chr> <dbl>
## 1 Los Angeles 16608405.
## 2 San Francisco 15716677.
## 3 Oakland 3940076.
## 4 San Diego 3932830.
## 5 Palo Alto 3375370.
## 6 Beverly Hills 3267148.
## 7 Berkeley 3104403.
## 8 Santa Monica 2926874.
## 9 San Jose 2445544.
## 10 Sacramento 2384442.
library(patchwork)
library(dplyr)
# Table showing total contributions received by candidate, in descending order
merged_dataframe %>%
  group_by(cand_nm) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  slice(1:10)
## # A tibble: 10 x 2
## cand_nm total_amount
## <chr> <dbl>
## 1 Clinton, Hillary Rodham 94784569.
## 2 Sanders, Bernard 20649084.
## 3 Trump, Donald J. 14064776.
## 4 Cruz, Rafael Edward 'Ted' 6075175.
## 5 Rubio, Marco 5595281.
## 6 Bush, Jeb 3357550.
## 7 Carson, Benjamin S. 3145278.
## 8 Kasich, John R. 1571849.
## 9 Fiorina, Carly 1501113.
## 10 Paul, Rand 823542.
# Table showing the top ten cities where Hillary Clinton received the most contributions
CA_2016_Clinton <- merged_dataframe %>%
  filter(cand_nm == "Clinton, Hillary Rodham") %>%
  group_by(primary_city) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  # reorder the city factor by amount so the later bar chart is sorted
  mutate(primary_city = fct_reorder(primary_city, total_amount)) %>%
  slice(1:10)
CA_2016_Clinton
## # A tibble: 10 x 2
## primary_city total_amount
## <fct> <dbl>
## 1 San Francisco 12345963.
## 2 Los Angeles 12022399.
## 3 Oakland 2955387.
## 4 Palo Alto 2664346.
## 5 Berkeley 2300980.
## 6 Beverly Hills 2226442.
## 7 Santa Monica 2134503.
## 8 San Diego 2094960.
## 9 Sacramento 1623772.
## 10 Los Altos 1500963.
# Table showing the top ten cities where Donald Trump received the most contributions
CA_2016_trump <- merged_dataframe %>%
  filter(cand_nm == "Trump, Donald J.") %>%
  group_by(primary_city) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  # reorder the city factor by amount so the later bar chart is sorted
  mutate(primary_city = fct_reorder(primary_city, total_amount)) %>%
  slice(1:10)
CA_2016_trump
## # A tibble: 10 x 2
## primary_city total_amount
## <fct> <dbl>
## 1 Los Angeles 544464.
## 2 San Diego 482885.
## 3 Newport Beach 347206.
## 4 Bakersfield 244419.
## 5 San Francisco 230712.
## 6 San Jose 190652.
## 7 Fresno 182674.
## 8 Rancho Santa Fe 178233.
## 9 Sacramento 171656.
## 10 Irvine 157518.
# Plot showing the top ten cities where Hillary Clinton received the most
# contributions, in descending order
Plot_clinton <- ggplot(CA_2016_Clinton, aes(x = total_amount, y = primary_city)) +
  geom_col() +
  theme_minimal() +
  labs(x = "", y = "", title = "Clinton, Hillary Rodham")
Plot_clinton
# Plot showing the top ten cities where Donald Trump received the most
# contributions, in descending order
Plot_trump <- ggplot(CA_2016_trump, aes(x = total_amount, y = primary_city)) +
  geom_col() +
  theme_minimal() +
  labs(x = "", y = "", title = "Trump, Donald J.")
Plot_trump
library(patchwork)
# Compares the top ten cities where Donald Trump and Hillary Clinton received
# the most contributions, in descending order, displaying them side by side
wrap_plots(Plot_clinton, Plot_trump)
library(tidytext)
# Shows the top ten candidates ranked by contributions to their campaigns,
# in descending order
top_cand <- merged_dataframe %>%
  group_by(cand_nm) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  arrange(desc(total_amount)) %>%
  slice(1:10)
top_cand
## # A tibble: 10 x 2
## cand_nm total_amount
## <chr> <dbl>
## 1 Clinton, Hillary Rodham 94784569.
## 2 Sanders, Bernard 20649084.
## 3 Trump, Donald J. 14064776.
## 4 Cruz, Rafael Edward 'Ted' 6075175.
## 5 Rubio, Marco 5595281.
## 6 Bush, Jeb 3357550.
## 7 Carson, Benjamin S. 3145278.
## 8 Kasich, John R. 1571849.
## 9 Fiorina, Carly 1501113.
## 10 Paul, Rand 823542.
# For each of the top ten candidates, keep their ten highest-contributing
# cities; reorder_within() builds per-candidate factor levels so each facet
# of the later plot can be sorted independently.
top_cand_10 <- merged_dataframe %>%
  filter(cand_nm %in% top_cand$cand_nm) %>%
  group_by(primary_city, cand_nm) %>%
  summarise(total_amount = sum(contb_receipt_amt)) %>%
  group_by(cand_nm) %>%
  slice_max(total_amount, n = 10) %>%
  ungroup() %>%
  mutate(
    cand_nm = as.factor(cand_nm),
    primary_city = reorder_within(primary_city, total_amount, cand_nm)
  )
# Plots the top ten candidates by contributions received by their campaign
# across primary cities, in descending order (one facet per candidate;
# scale_x_reordered() strips the reorder_within() suffixes from the labels).
Plot_top_10 <- ggplot(top_cand_10, aes(x = primary_city, y = total_amount)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~cand_nm, scales = "free") +
  coord_flip() +
  scale_x_reordered() +
  theme_bw() +
  scale_y_continuous(expand = c(0, 0)) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_text(angle = 90)
  )
Plot_top_10